This code follows the protocol for differential expression analysis of RNA sequencing data using R and Bioconductor given in Anders et al (2013).
# Load the data
samples = read.table("samples.csv", sep=";", header=TRUE)
dim(samples)
## [1] 7 7
counts = read.csv("counts.csv")
dim(counts)
## [1] 7196 7
library(edgeR)
counts = readDGE(samples$countf)$counts
dim(counts)
## [1] 15686 7
# Filter weakly expressed and noninformative (e.g., non-aligned) features
noint = rownames(counts) %in%
c("no_feature","ambiguous","too_low_aQual",
"not_aligned","alignment_not_unique")
cpms = cpm(counts)
counts.all <- counts[,order(samples$condition)]
colnames(counts.all) = samples$shortname[order(samples$condition)]
head(counts.all)
## CT.PA.1 CT.PA.2 CT.SI.5 CT.SI.7 KD.PA.3 KD.PA.4 KD.SI.6
## FBgn0000008 76 71 137 82 87 68 115
## FBgn0000014 2 4 1 1 9 1 0
## FBgn0000015 1 2 0 0 3 1 2
## FBgn0000017 3498 3087 7014 3926 3029 3264 4322
## FBgn0000018 240 306 613 485 288 307 528
## FBgn0000022 0 0 1 0 0 0 0
# Keep only genes with at least 1 read per million, for the smallest
# number of replicates in treatment
keep = rowSums(cpms > 1) >= 3 & !noint
counts = counts.all[keep,]
#colnames(counts) = samples$shortname
head(counts, 5)
## CT.PA.1 CT.PA.2 CT.SI.5 CT.SI.7 KD.PA.3 KD.PA.4 KD.SI.6
## FBgn0000008 76 71 137 82 87 68 115
## FBgn0000017 3498 3087 7014 3926 3029 3264 4322
## FBgn0000018 240 306 613 485 288 307 528
## FBgn0000032 611 672 1479 1351 694 757 1361
## FBgn0000042 40048 49144 97565 99372 70574 72850 95760
d = DGEList(counts = counts, group = samples$condition[order(samples$condition)])
str(d)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
## ..@ .Data:List of 2
## .. ..$ : num [1:7196, 1:7] 76 3498 240 611 40048 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:7196] "FBgn0000008" "FBgn0000017" "FBgn0000018" "FBgn0000032" ...
## .. .. .. ..$ : chr [1:7] "CT.PA.1" "CT.PA.2" "CT.SI.5" "CT.SI.7" ...
## .. ..$ :'data.frame': 7 obs. of 3 variables:
## .. .. ..$ group : Factor w/ 2 levels "CTL","KD": 1 1 1 1 2 2 2
## .. .. ..$ lib.size : num [1:7] 8397136 9909691 19087995 12812818 9664838 ...
## .. .. ..$ norm.factors: num [1:7] 1 1 1 1 1 1 1
d = calcNormFactors(d)
str(d)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
## ..@ .Data:List of 2
## .. ..$ : num [1:7196, 1:7] 76 3498 240 611 40048 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:7196] "FBgn0000008" "FBgn0000017" "FBgn0000018" "FBgn0000032" ...
## .. .. .. ..$ : chr [1:7] "CT.PA.1" "CT.PA.2" "CT.SI.5" "CT.SI.7" ...
## .. ..$ :'data.frame': 7 obs. of 3 variables:
## .. .. ..$ group : Factor w/ 2 levels "CTL","KD": 1 1 1 1 2 2 2
## .. .. ..$ lib.size : num [1:7] 8397136 9909691 19087995 12812818 9664838 ...
## .. .. ..$ norm.factors: num [1:7] 0.97 0.965 1.001 1.015 0.997 ...
d$samples
## group lib.size norm.factors
## CT.PA.1 CTL 8397136 0.9702373
## CT.PA.2 CTL 9909691 0.9652457
## CT.SI.5 CTL 19087995 1.0009795
## CT.SI.7 CTL 12812818 1.0145053
## KD.PA.3 KD 9664838 0.9973330
## KD.PA.4 KD 10325828 1.0146062
## KD.SI.6 KD 15324886 1.0391230
par(pty="s")
d.mds <- plotMDS(d, labels = samples$shortname[order(samples$condition)],
col = c("darkgreen","blue")[factor(samples$condition[order(samples$condition)])],
gene.selection = "common", xlim=c(-1.1, 1.1))
# Check the normalization
library(ggplot2)
library(GGally)
d$counts <- data.frame(d$counts) # Needs to be a data frame
d$lcounts <- log(d$counts+1)
ggparcoord(d$lcounts, columns=1:7, boxplot=TRUE, scale="globalminmax",
showPoints=FALSE, alphaLines=0) +
xlab("") + ylab("log(cpm)") +
theme_bw()
# ggparcoord(d$lcounts, columns=1:7, alphaLines=0.1)
# Use trimmed mean normalization
d = calcNormFactors(d, method="TMM")
nc = data.frame(cpm(d, normalized.lib.sizes = TRUE, log=TRUE))
ggparcoord(nc, columns=1:7, boxplot=TRUE, scale="globalminmax", showPoints=FALSE,
alphaLines=0) +
xlab("") + ylab("log(cpm)") +
theme_bw()
# Check that the MDS was conducted on the normalized data
d.mds2 <- plotMDS(nc, labels = samples$shortname[order(samples$condition)],
col = c("darkgreen","blue")[factor(samples$condition[order(samples$condition)])],
gene.selection = "common", xlim=c(-1.1, 1.1))
# Scatterplot matrix of normalized data
ggscatmat(nc)
# Full parallel coordinate plot
ggparcoord(nc, columns=1:7, scale="globalminmax", alphaLines=0.5) +
xlab("") + ylab("log(cpm)") +
theme_bw()
ggparcoord(nc, columns=1:7, scale="globalminmax", alphaLines=0.01) +
xlab("") + ylab("log(cpm)") +
theme_bw()
# Porcupine plot
ggplot(nc) + geom_segment(aes(x=CT.PA.1, xend=CT.PA.2, y=KD.PA.3, yend=KD.PA.4)) +
theme_bw() + theme(aspect.ratio=1)
# Check the mean and variance relationship
# For the Poisson model, the mean = variance
# RNA seq tends to be more overdispersed, variance is larger than expected
# Over-dispersed leads to fitting a negative binomial model
# Common dispersion would be used if all genes assumed to have same variance
d = estimateCommonDisp(d)
d$common.dispersion
## [1] 0.03329083
# Tagwise dispersion is a weighted average of individual gene dispersion
# with common dispersion
d = estimateTagwiseDisp(d)
summary(d$tagwise.dispersion)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01647 0.02187 0.02589 0.03540 0.03599 1.67900
d$prior.n
## [1] 2
plotMeanVar(d, show.tagwise.vars = TRUE, NBline = TRUE)
plotBCV(d)
# Examine the dispersions, in relation to the means
mv <- binMeanVar(d, group = d$samples$group)
# $means are the means for each gene
# $vars are the pooled variances for each gene
qplot(mv$means, mv$vars, alpha=I(0.5)) + scale_x_log10() + scale_y_log10() +
geom_smooth(method="lm") + theme_bw() + theme(aspect.ratio=1)
qplot(mv$means, d$tagwise.dispersion) + scale_x_log10() + scale_y_log10() +
geom_smooth() + theme_bw() + theme(aspect.ratio=1)
qplot(mv$vars, d$tagwise.dispersion) + scale_x_log10() + scale_y_log10() +
geom_smooth() + theme_bw() + theme(aspect.ratio=1)
# Test for differential expression (‘classic’ edgeR)
de = exactTest(d, pair = c("CTL","KD"))
tt = topTags(de, n = nrow(d), sort.by="none")
nc.sig <- data.frame(gene=rownames(nc), nc, tt)
nc.sig$sig05 <- ifelse(nc.sig$FDR < 0.05, "S", "NS")
nc.sig$sig01 <- ifelse(nc.sig$FDR < 0.01, "S", "NS")
# Porcupine plot with significance
ggplot(nc.sig) + geom_segment(aes(x=CT.PA.1, xend=CT.PA.2, y=KD.PA.3,
yend=KD.PA.4, color=sig05)) +
scale_color_manual(values=c("S"="red", "NS"="grey90")) +
xlab("CT") + ylab("KD") +
theme_bw() + theme(aspect.ratio=1)
ggplot(nc.sig) + geom_segment(aes(x=CT.PA.1, xend=CT.PA.2, y=KD.PA.3,
yend=KD.PA.4, color=sig01)) +
scale_color_manual(values=c("S"="red", "NS"="grey90")) +
xlab("CT") + ylab("KD") +
theme_bw() + theme(aspect.ratio=1)
# MV plots
nc.sig$sig01 <- factor(nc.sig$sig01, levels=c("NS","S"))
ggscatmat(nc.sig, columns=2:8, color="sig01") +
scale_color_manual(values=c("S"="red", "NS"="white"))
ggscatmat(nc.sig[nc.sig$sig01=="S",], columns=2:8)
ggparcoord(nc.sig, columns=2:8, scale="globalminmax", alphaLines=0.5,
groupColumn="sig01") +
scale_color_manual(values=c("S"="red", "NS"="grey90")) +
xlab("") + ylab("log(cpm)") +
theme_bw()
ggparcoord(nc.sig[nc.sig$sig01=="S",], columns=2:8, scale="globalminmax",
alphaLines=0.5) +
xlab("") + ylab("log(cpm)") +
theme_bw()
# Interaction plots of top genes
library(tidyr)
library(dplyr)
nc.sig <- arrange(nc.sig, PValue)
g1 <- gather(nc.sig[1,2:8])
g1$trt <- substr(g1$key, 1, 2)
g1$read <- substr(g1$key, 4, 5)
g1.mean <- summarise(group_by(g1, trt), m=mean(value))
qplot(trt, value, data=g1, xlab="Treatment", ylab="logCPM", colour=read,
size=I(5), alpha=I(0.5)) +
annotate("segment", x=1, xend=2, y=g1.mean$m[1],
yend=g1.mean$m[2], colour="grey80") +
ggtitle(nc.sig$gene[1]) +
theme_bw() + theme(aspect.ratio=1)
table(nc.sig$sig01)
##
## NS S
## 6882 314
g314 <- gather(nc.sig[314,2:8])
g314$trt <- substr(g314$key, 1, 2)
g314$read <- substr(g314$key, 4, 5)
g314.mean <- summarise(group_by(g314, trt), m=mean(value))
qplot(trt, value, data=g314, xlab="Treatment", ylab="logCPM", colour=read,
size=I(5), alpha=I(0.5)) +
annotate("segment", x=1, xend=2, y=g314.mean$m[1],
yend=g314.mean$m[2], colour="grey80") +
ggtitle(nc.sig$gene[314]) +
theme_bw() + theme(aspect.ratio=1)
# To check the effect size, visually, we are going to scramble the
# labels, and re-run the significance testing
dp <- d
ncp <- nc
dp$samples$group <- c("CTL","KD","CTL","KD","CTL","KD","CTL")
dp$samples$group
## [1] "CTL" "KD" "CTL" "KD" "CTL" "KD" "CTL"
dep = exactTest(dp, pair = c("CTL","KD"))
ttp = topTags(dep, n = nrow(dp), sort.by="none")
ncp.sig <- data.frame(gene=rownames(ncp), ncp, ttp)
ncp.sig$sig05 <- ifelse(ncp.sig$FDR < 0.05, "S", "NS")
ncp.sig$sig01 <- ifelse(ncp.sig$FDR < 0.01, "S", "NS")
ncp.sig <- arrange(ncp.sig, PValue)
g1 <- gather(ncp.sig[1,2:8])
g1$trt <- dp$samples$group
g1.mean <- summarise(group_by(g1, trt), m=mean(value))
qplot(trt, value, data=g1, xlab="Treatment", ylab="logCPM",
size=I(5), alpha=I(0.5)) +
annotate("segment", x=1, xend=2, y=g1.mean$m[1],
yend=g1.mean$m[2], colour="grey80") +
ggtitle(ncp.sig$gene[314]) +
theme_bw() + theme(aspect.ratio=1)
table(ncp.sig$sig01)
##
## NS
## 7196
g314 <- gather(ncp.sig[314,2:8])
g314$trt <- dp$samples$group
g314.mean <- summarise(group_by(g314, trt), m=mean(value))
qplot(trt, value, data=g314, xlab="Treatment", ylab="logCPM",
size=I(5), alpha=I(0.5)) +
annotate("segment", x=1, xend=2, y=g314.mean$m[1],
yend=g314.mean$m[2], colour="grey80") +
ggtitle(ncp.sig$gene[314]) +
theme_bw() + theme(aspect.ratio=1)
Anders, S., McCarthy, D. J., Chen, Y., Okoniewski, M., Smyth, G. K., Huber, W. and Robinson, M. D. (2013) “Count-based differential expression analysis of RNA sequencing data using R and Bioconductor”, Nature Protocols, 8(9):1765-1786, doi:10.1038/nprot.2013.099.